First the data is gathered from the different websites.
Load libraries
library(RCurl) library(jsonlite) library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(here)
here() starts at /Users/AK/Library/Mobile Documents/com~apple~CloudDocs/Skrivebord/STUDIE/3. studieår/R for Bio Data Science/group_03_project
Create data directory
# Create 'data' directory if it doesn't existif (!dir.exists(here("data"))) {dir.create(here("data")) }# Create 'raw' directory if it doesn't existif (!dir.exists(here("data/_raw"))) {dir.create(here("data/_raw")) }
Get diabetes data
# Fetch dataseturl <-"https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip"temp <-tempfile()download.file(url, temp)# Unzip files directly into the 'data' directoryunzip(temp, exdir =here("data"))# The data lies in a subfolder, whichi we would like to removepath_a <-file.path(here("data"), "dataset_diabetes")path_b <-file.path(here("data"), "_raw")# Get the list of files with their full paths from 'path_a'my_files <-list.files(path_a, full.names =TRUE)# Copy files to 'path_b'file.copy(from = my_files, to = path_b, overwrite =TRUE)
[1] TRUE TRUE
# Delete the now-empty folderunlink(path_a, recursive =TRUE)
Get ICD9 data
# Fetch dataurl_ICD9 <-"https://www.cms.gov/medicare/coding/icd9providerdiagnosticcodes/downloads/icd-9-cm-v32-master-descriptions.zip"temp <-tempfile()download.file(url_ICD9, temp)# Unzip files directly into the '_raw' directoryunzip(temp, exdir =here("data/_raw"))# The zip-folder also contains data on surgaries, which is not relevent here. The unused files are removed unlink(here("data/_raw/CMS32_DESC_LONG_SG.txt"))unlink(here("data/_raw/CMS32_DESC_SHORT_SG.txt"))unlink(here("data/_raw/CMS32_DESC_LONG_SHORT_SG.xlSX"))unlink(here("data/_raw/CMS32_DESC_LONG_SHORT_DX.xlsx"))
Clean data
The four different files are then combined, and different types of NA values are combined.
suppressWarnings({#Data is loaded without defined column namesmeta_data <-read_csv(here("data/_raw/IDs_mapping.csv"),na ="", col_names =c("type_id", "description"), show_col_types =FALSE)# Remove NA columns and add a column based on admission type, that is numeric. The NA values in this, will be the names of the type of metadata the id describes. Also adds a meta_type column, which has the type in the cells that are NAs in the numeric type_id_as_nummeta_data <- meta_data |>na.omit() |>mutate(type_id, type_id_as_num =as.numeric(type_id)) |>mutate(meta_type =ifelse(is.na(type_id_as_num), type_id, NA))#Fills meta_type with the last non-NA cell meta_data <- meta_data |>fill(meta_type)#Removes the rows containing the names, drops admission_type_is_as_nummeta_data_clean <- meta_data |>filter(!is.na(type_id_as_num)) |>select(!type_id) |>rename(type_id = type_id_as_num)})admission_type <- meta_data_clean |>filter(meta_type =="admission_type_id") |>select(-meta_type) |>mutate(description =case_when(description %in%c("NULL", "Not Mapped", "Not Available") ~NA, TRUE~ description))discharge_disposition <- meta_data_clean |>filter(meta_type =="discharge_disposition_id") |>select(-meta_type) |>mutate(description =case_when(description %in%c("NULL", "Not Mapped", "Unknown/Invalid") ~NA, TRUE~ description))admission_source <- meta_data_clean |>filter(meta_type =="admission_source_id") |>select(-meta_type) |>mutate(description =case_when(description %in%c("NULL", "Not Mapped", "Unknown/Invalid", "Not Available") ~NA, TRUE~ description))
Clean ICD9 data
The data is downloaded here as the newest version from 2014:
From the files the short and long ds are chosen (sg are surgical and dx are diagnoses).
When running the 01_load.qmd, all the relevant files should be downloaded and placed in data/_raw
# First the data will be loaded into a dataframe of one columnICD9_short <-read_delim(here("data/_raw/CMS32_DESC_SHORT_DX.txt"), col_names ="data", show_col_types =FALSE, delim ="\n")ICD9_long <-read_delim(here("data/_raw/CMS32_DESC_LONG_DX.txt"), col_names ="data", show_col_types =FALSE, delim ="\n")#The long dataset includes é and è in for example Ménière's disease. To deal with this, the data is converted to UTF-8ICD9_long <- ICD9_long |>mutate(data =iconv(data, from ="latin1", to ="UTF-8", sub ="byte"))# The first value in each row is the ID, but the delimiter is not preserved. Each row is therefore split at the first, and everything else is merged into the second column:ICD9_short <- ICD9_short |>separate(col = data,into =c("ID", "Description_short"), extra ="merge")ICD9_long <- ICD9_long |>separate(col = data,into =c("ID", "Description_long"), extra ="merge")# The long and short dataframe, should have identical IDs, however in case there are some differences, we want all data, and therefore use full_joinICD9 <-full_join(ICD9_short, ICD9_long, by ="ID")suppressWarnings({ICD9 <- ICD9 |>mutate(ID =ifelse(str_starts(ID, "[VE]"), ID,as.double(str_c(substr(ID,1,3),".",substr(ID,4,length(ID))))))})
Clean diabetes data
diabetes_data <-read_csv(here("data/_raw/diabetic_data.csv"), na =c("?", "None"), show_col_types =FALSE)#Change NO in readmitted column to Nodiabetes_data <- diabetes_data |>mutate(readmitted, readmitted =str_replace(string = readmitted, "NO", "No"))#Change Ch in change column to Yesdiabetes_data <- diabetes_data |>mutate(change, change =str_replace(string = change, "Ch", "Yes"))
Two columns are created, one for the amount of encounters per patient and one for the total amount of days the patient has spend in the hospital within the dataset.
Rows: 101766 Columns: 53
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (43): race, gender, age, weight, admission_type, discharge_disposition, ...
dbl (10): encounter_id, patient_nbr, time_in_hospital, num_lab_procedures, n...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Encounters in dataset
Add columns that signifies the amount of encounters one patient has in the data set.
Rows: 101766 Columns: 55
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (43): race, gender, age, weight, admission_type, discharge_disposition, ...
dbl (12): encounter_id, patient_nbr, nr_encounters, time_in_hospital, total_...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# A tibble: 6 × 2
# Groups: race [6]
race n
<chr> <int>
1 AfricanAmerican 12932
2 Asian 517
3 Caucasian 53601
4 Hispanic 1534
5 Other 1209
6 <NA> 1977
Visualizations
The data is visualized to discover trends.
Libraries
library(tidyverse)library(here)
Load data
data <-read.csv(here("data/03_dat_aug.csv"))
Age distribution
data |>group_by(age, gender) |>summarise(average_diagnoses =mean(number_diagnoses), .groups ="drop") |>ggplot(aes(x = age,y = average_diagnoses,fill = gender)) +geom_col(position ="dodge") +labs(x ="Age Group",y ="Average number of diagnoses",fill ="Gender",title ="Average number of diagnoses for all age groups and genders per encounter") +scale_fill_manual(values=c('hotpink','cornflowerblue','grey35'))
Age distribution, shown by race
data |>mutate(race =ifelse(is.na(race),'Other',race)) |>mutate(race =ifelse(race =='Other', 'Other/Unspecified', race)) |>group_by(age, gender, race) |>summarize(average_diagnoses =mean(number_diagnoses), .groups ="drop") |>ggplot(mapping =aes(x = age,y = average_diagnoses,fill = gender)) +geom_col(position ="dodge", alpha =1.0) +facet_wrap(~ race) +labs(x ="Age Group",y ="Average number of diagnoses",fill ="Gender",title ="Average number of diagnoses for all age groups and genders,\nfacet wrapped for race") +scale_fill_manual(values=c('hotpink','cornflowerblue','grey35')) +theme_minimal() +theme(axis.text.x =element_text(angle =60, hjust =1),legend.position ='inside',legend.position.inside =c(0.85,0.15))
#Finding the 8 most frequent discharge disposition for any admission typetop_8_discharge_dispositions <- data |>group_by(discharge_disposition)|>summarize(n =n()) |>#Summarizing the total amount of occurences of each discharge typearrange(desc(n)) |>#Arranging from high -> low head(8) |>select(discharge_disposition) |>pull() #Converting into a vectordata |>select(discharge_disposition, age) |>filter(discharge_disposition %in% top_8_discharge_dispositions) |>#Filtering for only the cases where the discharge type was in the top 8 group_by(discharge_disposition, age) |>summarize(n =n(),.groups ='keep') |>drop_na(discharge_disposition) |>ggplot(mapping =aes(x = discharge_disposition,y = n,fill = age) ) +geom_col(position ="dodge") +scale_fill_viridis_d(option ="viridis") +scale_x_discrete(labels =function(x) str_wrap(x, width =35)) +labs(x ="Discharge disposition",y ="Count",fill ="Age group",title ="Discharge disposition for all age groups for all admission types" ) +theme_minimal() +theme(axis.text.x =element_text(angle =60, hjust =1) )
Only for emergency admission type
#Finding the top 8 for the casses related to emergency admissions onlytop_8_discharge_dispositions <- data |>filter(admission_type =="Emergency") |>group_by(discharge_disposition)|>summarize(n =n()) |>arrange(desc(n)) |>head(8) |>select(discharge_disposition) |>pull()data |>filter(admission_type =="Emergency") |>select(discharge_disposition, age) |>filter(discharge_disposition %in% top_8_discharge_dispositions) |>group_by(discharge_disposition, age) |>summarize(n =n(),.groups ='keep') |>drop_na(discharge_disposition) |>ggplot(mapping =aes(x = discharge_disposition,y = n,fill = age) ) +geom_col(position ="dodge") +scale_fill_viridis_d(option ="viridis") +scale_x_discrete(labels =function(x) str_wrap(x, width =35)) +labs(x ="Discharge disposition",y ="Count",fill ="Age group",title ="Discharge disposition for all age groups for emergency admissions" ) +theme_minimal() +theme(axis.text.x =element_text(angle =60, hjust =1) )
diabetes_count |>mutate(type =ifelse(str_detect(all_diagnoses,'type I '),'Type I','Type II or unspecified'),uncontrolled =ifelse(str_detect(all_diagnoses,', uncontrolled'), 'Uncontrolled', 'Not specified as uncontrolled'),no_complication =ifelse(str_detect(all_diagnoses, 'without mention of complication'),'No complication','Complications')) |>mutate(complication_type =ifelse(no_complication =='Complications',str_to_sentence(str_extract(all_diagnoses,'(?<=with ).*(?=, *type)')),'None')) |>mutate(complication_type =ifelse(is.na(complication_type),'Pregnancy complications',complication_type)) |>mutate(complication_type =factor(complication_type, levels =c(unique(complication_type[complication_type !='None']), 'None'))) |>ggplot(aes(x = uncontrolled, y=count, fill = complication_type)) +geom_bar(stat='identity',) +facet_wrap(~type) +labs(title ='Diabetes managment and complications',subtitle =str_c('For patients with diabetes as primary or secondary diagnosis (',sum(diabetes_count$count),' patients)'),x ='Controllation of disease', y ='Number of instances', fill ='Complications') +theme_minimal() +scale_x_discrete(labels =function(x) str_wrap(x,width =17)) +scale_fill_viridis_d(option ='viridis',direction=-1)
Only for people with complications
diabetes_count |>mutate(type =ifelse(str_detect(all_diagnoses,'type I '),'Type I','Type II or unspecified'),uncontrolled =ifelse(str_detect(all_diagnoses,', uncontrolled'), 'Uncontrolled', 'Not specified as uncontrolled'),no_complication =ifelse(str_detect(all_diagnoses, 'without mention of complication'),'No complication','Complications')) |>mutate(complication_type =ifelse(no_complication =='Complications',str_to_sentence(str_extract(all_diagnoses,'(?<=with ).*(?=, *type)')),'None')) |>mutate(complication_type =ifelse(is.na(complication_type),'Pregnancy complications',complication_type)) |>filter(!complication_type =='None') |>ggplot(aes(x = uncontrolled, y=count, fill = complication_type)) +geom_bar(stat='identity',) +facet_wrap(~type) +labs(title ='Diabetes managment and complications',subtitle =str_c('For patients with diabetes as primary or secondary diagnosis\n(only for cases with complications: ', diabetes_count |>filter(!str_detect(all_diagnoses, 'without mention of complication')) |>select(count) |>sum(),' patients)'),x ='Controllation of disease', y ='Number of instances', fill ='Complications') +theme_minimal() +scale_fill_viridis_d(option ='plasma',direction=-1) +scale_x_discrete(labels =function(x) str_wrap(x,width =17))
sidebarLayout( sidebarPanel( # File upload to load the data fileInput(“data_file”, “Upload Dataset”, accept = “.csv”),
# Filter for readmission status
selectInput(
"readmission_filter",
"Filter by Readmission Status:",
choices = c("All", unique(df$readmitted)),
selected = "All"
),
selectInput(
"age",
"Filter by age group:",
choices = c("All", sort(unique(df$age))),
selected = "All"
),
selectInput(
"race",
"Filter by race:",
choices = c("All", sort(unique(df$race))),
selected = "All"
),
selectInput(
"gender",
"Filter by gender",
choices = c("All", sort(unique(df$gender))),
selected = "All"
),
# Checkboxes for medications
checkboxGroupInput(
"medication_filter",
"Filter by Medications:",
choices = c(
"Metformin", "Repaglinide", "Nateglinide", "Chlorpropamide",
"Glimepiride", "Acetohexamide", "Glipizide", "Glyburide",
"Tolbutamide", "Pioglitazone", "Rosiglitazone", "Acarbose",
"Miglitol", "Troglitazone", "Tolazamide", "Examide",
"Sitagliptin", "Insulin", "Glyburide-Metformin",
"Glipizide-Metformin", "Glimepiride-Pioglitazone",
"Metformin-Rosiglitazone", "Metformin-Pioglitazone"
),
selected = NULL
)
),
mainPanel(
# Display a bar plot
plotOutput("readmissionPlot")
)
) )
library(shiny)
Define server logic
server <- function(input, output) {
# Reactive data loading dataset <- reactive({ req(input\(data_file) # Ensure a file is uploaded
load_data(input\)data_file$datapath) # Call the load_data function })
# Reactive filtering filtered_data <- reactive({ data <- dataset()
# Filter by readmission status
if (input$readmission_filter != "All") {
data <- data[data$Readmitted == input$readmission_filter, ]
}
# Filter by medications
if (!is.null(input$medication_filter)) {
for (med in input$medication_filter) {
if (med %in% colnames(data)) { # Only filter if column exists
data <- data[data[[med]] == 1, ]
} else {
warning(paste("Medication column", med, "not found in dataset"))
}
}
}
if (nrow(data) == 0) {
return(NULL) # Handle empty results gracefully
}
data
})
# Render the bar plot output$readmissionPlot <- renderPlot({ data <- filtered_data() if (is.null(data)) { plot.new() text(0.5, 0.5, “No data to display after filtering”, cex = 1.5) return() }
# Count the number of patients by readmission status
readmission_counts <- table(data$Readmitted)
# Create a bar plot
barplot(
readmission_counts,
main = "Readmission Status",
ylab = "Number of Patients",
xlab = "Readmission",
col = c("skyblue", "orange", "lightgreen")
)